/*** CREATES TRACKING MEASURES BY CAMPUS-GRADE-YEAR ***/

clear all
set more 1

do "paths.do"
cd "$WORKING"

** it is possible to set multiple scores types to get tracking measures for.
** we only use z-scores.
global SCORE_TYPES z

** set number of permutations to run for Purposeful Assigment
global NSIMPURP 1000
timer clear

/*
R-squared: Actual R2 from regression of prior scores on class indicators
Sorting index:
 Collins&Gan(2013) "does sorting students improve scores? An analysis of class composition"
 j-class, k-campus
 a_1k: s.d. of test scores within a school, defined as (1/N)*sum(deviations from overall mean)
 a_2k: square root of (enrollment weighted average of (variance of test scores in each class j))
 sort_k=a_1k/a_2k
 Note: we adjusted a_2k to be (1/N)*sum(deviations from class mean), so captures extent of
  tracking for typical student rather than typical classroom
In addition to actual measures, we create two simluated distributions for standardizing:
 i) Repeatedly randomly assigning students to classes maintaining actual numbers and sizes of classes
 ii) Repeatedly reordering classes and allocating students to these strictly by test score rank

This script divides the dataset into chunks to most efficiently calculate the
Purposeful Assigment algorithm. It exploits the fact that cohorts with similar
numbers of classes require a similar number of calculations.
 
Run time: around 3 days
*/


/************************************************
Subsets of tracking_6 for use in this script
************************************************/

foreach ss in math ela sci soc {
	foreach year in 2011 2012 2013 2014 2015 2016 2017 2018 2019 {
		forval g = 4(1)8{
			capture confirm file "$WORKING\tracking_7 pieces\tracking_7_Z_`year'_`g'_`ss'.dta"
			if (_rc == 0) {
				di "Yes $WORKING\tracking_7 pieces\tracking_7_Z_`year'_`g'_`ss'.dta already exists"
			}
			else {
				di "No $WORKING\tracking_7 pieces\tracking_7_Z_`year'_`g'_`ss'.dta does not exist"
				use campus distnum geo_distnum distnum0 grade year Sservicex serv_consol Sclass_id LMscore zLMscore Dwhite Dblack Dhisp Dasian Dother Pdisadv id1 ///
						if year==`year' & grade==`g' using "$WORKING\tracking_6-`ss'", clear
				save "$WORKING\tracking_7 pieces\tracking_7_Z_`year'_`g'_`ss'.dta"
			}
		}
	}

	capture confirm file "$WORKING\tracking_7 pieces\tracking_7_Z_`ss'.dta"
	if (_rc == 0) {
		di "Yes $WORKING\tracking_7 pieces\tracking_7_Z_`ss'.dta already exists"
	}
	else {
		di "No $WORKING\tracking_7 pieces\tracking_7_Z_`ss'.dta does not exist"
		
		use "$WORKING\tracking_6-`ss'", clear
		
		keep campus grade year C* Sclass_id zLMscore
		
		egen class_sd = sd(zLMscore), by(campus grade year Sclass_id)
		replace class_sd = 0 if (class_sd == .) & (zLMscore != .)
		egen CGavgclssd = mean(class_sd), by(campus grade year)
		
		// drop *_bycls *_bysvc
		drop *_by_svc class_sd
		duplicates drop campus grade year, force

		save "$WORKING\tracking_7 pieces\tracking_7_Z_`ss'.dta"
	}
}


/************************************************
LOOP OVER SUBJECT AND YEAR AND GRADE
************************************************/

** loop over subject
** foreach ss in math ela sci soc {
foreach ss in soc {
	** loop over year
	** foreach year in 2011 2012 2013 2014 2015 2016 2017 2018 2019 {
	foreach year in 2011 2012 2013 2014 2015 2016 2017 2018 2019 {
		local skip = (("`ss'" == "soc") & (`year' < 2018))
		if (`skip') {
			continue
		}
		
		** loop over grade
		** forval g = 4(1)8{
		forval g = 4(1)8{
			capture log close
			log using "$LOGFILES\tracking_7\tracking_7_`year'_`g'_`ss'.log", replace
		
			timer on 1
			
			display "`ss' `year' `g'"
			
			/*
			Tracking measures are calculated at both the level of the sorting between
			schools within districts and the sorting between classes within schools.
			Naming of variables follows conceptually from the classes-within-schools
			calculations.
			*/
			foreach trk_lev in dis_sch sch_cls {
				
				use "$WORKING\tracking_7 pieces\tracking_7_Z_`year'_`g'_`ss'.dta", clear
				
				if ("`trk_lev'" == "dis_sch"){
					keep distnum0 grade year campus Sservicex serv_consol zLMscore id1
					drop if distnum0 == .
					rename campus Sclass_id
					gen campus = distnum0
				}
				else{
					keep distnum0 campus grade year Sclass_id Sservicex serv_consol zLMscore id1
				}
				
				** calculate Herfindahl concentration index on course names (Sservicex)
				sort campus Sclass_id
				by campus: gen tmp0=_N
				gen tmp1 = 1
				egen tmp2 = sum(tmp1), by(campus serv_consol)
				egen tmp3 = sum(tmp2), by(campus)
				gen hindex_predrop = tmp3 / (tmp0 ^ 2)
				drop tmp*
				
				disp "/*** CREATE VARIABLES WITH NUMBERS AND SIZES OF CLASSES BY CAMPUS ***/"
				
				** drop if missing prior test score
				drop if zLMscore==.

				** drop if class size=1
				bysort campus Sclass_id: gen tmp1=_N
				gen tmponestd=(tmp1==1)
				summ tmponestd
				drop if tmponestd==1
				
				** drop if only one class w/2+ students
				bysort campus: gen tmp2=_N
				gen tmponecls=(tmp1==tmp2)
				summ tmponecls
				drop if tmponecls==1
				drop tmp*

				** create identifiers and enrollment counts for schools and classes
				bysort campus Sclass_id: gen tmp1=_n
				count if tmp1==1
				sort campus Sclass_id
				egen gsch=group(campus)
				egen gcls=group(gsch Sclass_id)
				egen clstag=tag(gcls)
				egen numcls=total(clstag), by(gsch)
				bys gsch: gen schsz=_N
				bys gcls: gen clssz=_N
				summ g* numcls schsz clssz clstag
				* A = number of campuses
				summ gsch
				scalar A=r(max)
				* B = maximum number of classes for any school in grade
				summ numcls
				scalar B=r(max)
				drop Sclass_id tmp*

				** number classes w/in schools (from 1 to the # of classes)
				* here we number classes starting with the largest
				gsort gsch -clstag -clssz gcls
				bys gsch clstag: gen tmp1=_n if clstag==1
				egen clsnum=max(tmp1), by(gcls)
				egen tmp2=max(clsnum), by(gsch)
				assert tmp2==numcls
				drop tmp*
				 
				** create class size distribution variables
				* clsszX is size of class #X, and cumclssszX cumulates students across classes 1 to X
				forvalues i=1(1)`=B'{
				 local j=`i'-1
				 gen tmp1=clssz if clsnum==`i'
				 egen clssz`i'=max(tmp1), by(gsch)
				 if `i'==1{
				  gen cumclssz1=clssz1
				 }
				 if `i'>1{
				  gen cumclssz`i'=clssz`i'+cumclssz`j'
				 }
				 drop tmp*
				}
				summ clssz* cumclssz*
				
				** calculate Herfindahl concentration index on course names (Sservicex)
				sort gsch gcls
				gen tmp1 = 1
				egen tmp2 = sum(tmp1), by(gsch serv_consol)
				egen tmp3 = sum(tmp2), by(gsch)
				gen hindex_postdrop = tmp3 / (schsz ^ 2)
				drop tmp*
				
				disp "/*** CALCULATE ACTUAL R-SQUARED INDICES ***/"
				
				foreach vv in $SCORE_TYPES {
					** overall std dev of scores
					egen tmp1=sd(`vv'LMscore), by(gsch)
					* adjust std dev by dividing by N rather than N-1
					* in stata, variance is (1/(N-1))*(sum of deviations from the mean), and std dev is sqrt,
					* so need to adjust
					gen `vv'NUM=tmp1*sqrt((schsz-1)/schsz)

					** std dev of student scores from classroom averages
					egen tmp2=mean(`vv'LMscore), by(gcls)
					qui replace tmp2=(`vv'LMscore-tmp2)^2
					egen tmp3=sum(tmp2), by(gsch)
					qui replace tmp3=sqrt(tmp3/schsz)

					** calculate the sorting index
					gen `vv'trackS=`vv'NUM/tmp3
					drop tmp*

					** calculate the R-squared
					egen y_bar = mean(`vv'LMscore), by(gsch)
					egen y_hat = mean(`vv'LMscore), by(gcls)
					gen yh_yb2 = (y_hat - y_bar)^2
					gen y_yb2 = (`vv'LMscore - y_bar)^2
					egen r2_num = sum(yh_yb2), by(gsch)
					egen r2_denom = sum(y_yb2), by(gsch)
					gen `vv'trackR = r2_num / r2_denom
					drop y_bar y_hat yh_yb2 y_yb2 r2_num r2_denom
					
					** recalculate the tracking measures using the minimum loop factors
					
					gen tmp1 = `vv'LMscore ^ 2
					egen `vv'iota_s = sum(tmp1), by(gsch)
					drop tmp1

					egen tmp1 = sum(`vv'LMscore), by(gsch)
					gen `vv'lambda_s = (tmp1 ^ 2) / schsz
					drop tmp1

					egen tmp1 = sum(`vv'LMscore), by(gcls)
					gen tmp2 = (tmp1 / clssz) ^ 2
					egen `vv'kappa_s = sum(tmp2), by(gsch)
					drop tmp1 tmp2
					
					** verify that `vv'trackR = `vv'trackR_alt
					gen `vv'trackR_alt = (`vv'kappa_s - `vv'lambda_s) / (`vv'iota_s - `vv'lambda_s)
					
					/*
					Generate moments of the distribution of tracking under "random
					assignment". Students are assigned randomly to classes, but
					class sizes stay the same as true enrollments. Calculate mean
					and sd of those distributions analytically, rather than using
					simulations, using the first four moments of the score
					distribution within a cohort.
					*/
					egen Ex = mean(`vv'LMscore), by(gsch)
					egen Ex2 = mean(`vv'LMscore ^ 2), by(gsch)
					egen Ex3 = mean(`vv'LMscore ^ 3), by(gsch)
					egen Ex4 = mean(`vv'LMscore ^ 4), by(gsch)
					
					** schsz >= 4, so all fourth moments are defined
					gen Exixixixj = (Ex * Ex3 * (schsz / (schsz - 1))) - (Ex4 / (schsz - 1))
					gen Exixixjxj = (Ex2 * Ex2 * (schsz / (schsz - 1))) - (Ex4 / (schsz - 1))
					gen Exixixjxk = ( ((schsz ^ 2) * (Ex ^ 2) * Ex2) + (2 * Ex4) - ///
							((schsz * (Ex2 ^ 2)) + (2 * schsz * Ex * Ex3)) ) / ///
							((schsz - 1) * (schsz - 2))
					gen Exixjxkxl = ( ((schsz ^ 3) * (Ex ^ 4)) + (8 * schsz * Ex * Ex3) + (3 * schsz * (Ex2 ^ 2)) - ///
							((6 * (schsz ^ 2) * (Ex ^ 2) * Ex2) + (6 * Ex4)) ) / ///
							((schsz - 1) * (schsz - 2) * (schsz - 3))
					
					** clssz >= 2, so psi_43 and psi_44 may equal zero
					egen tmp1 = sum(clssz ^ 2) if clstag, by(gsch)
					egen tmp2 = sum((clssz - 1) ^ 2) if clstag, by(gsch)
					gen psi_0 = (numcls * (numcls - 1))
					gen psi_1 = ((schsz - numcls) ^ 2) - tmp2
					gen psi_2 = (schsz ^ 2) + tmp2 - (((schsz - numcls) ^ 2) + tmp1 + psi_0)
					egen psi_3 = sum(1 / (clssz ^ 1)) if clstag, by(gsch)
					egen psi_43 = sum(((clssz - 1) * (clssz - 2) * (clssz - 3)) / (clssz ^ 1)) if clstag, by(gsch)
					egen psi_42 = sum(((clssz - 1) * (clssz - 2)) / (clssz ^ 1)) if clstag, by(gsch)
					egen psi_41 = sum(((clssz - 1)) / (clssz ^ 1)) if clstag, by(gsch)
					
					gen EK2 = ((psi_0 + (3 * psi_41)) * Exixixjxj) + ///
							((psi_1 + psi_43) * Exixjxkxl) + ///
							((psi_2 + (6 * psi_42)) * Exixixjxk) + ///
							(psi_3 * Ex4) + (4 * psi_41 * Exixixixj)

					gen `vv'trackRavg = (numcls - 1) / (schsz - 1)
					gen EK = `vv'lambda_s + (`vv'trackRavg * (`vv'iota_s - `vv'lambda_s))
					gen Erho2 = (EK2 + (`vv'lambda_s ^ 2) - (2 * `vv'lambda_s * EK)) / ((`vv'iota_s - `vv'lambda_s) ^ 2)
					gen Vrho = Erho2 - (`vv'trackRavg ^ 2)
					gen tmp3 = sqrt(Vrho)
					egen `vv'trackRsd = min(tmp3), by(gsch)
					drop tmp*
					
					rename Ex `vv'Ex
					rename Ex2 `vv'Ex2
					rename Ex3 `vv'Ex3
					rename Ex4 `vv'Ex4
					
					drop psi_* EK2 EK Erho2 Vrho Exix*
				}
				
				save "$TEMPFILES\tracking_7_A.dta", replace

				timer off 1
				
				*** central loop for simulations *********************************************************************
				
				set seed 1312018

				** loop separately by deciles of numbers of classes in school x grade x year

				foreach nc in 1 2 3 4 5 6 7 8 9 10 {

					timer on 2

					** identify and keep classes within the relevant decile

					global has_`nc' = 0
					local nc_low = floor((`nc' - 1) * (B / 10))
					local nc_high = floor((`nc') * (B / 10))

					if(`nc' == 10){
						di "These should be same"
						di B
						di `nc_high'
					}
					
					use "$TEMPFILES\tracking_7_A.dta", clear
					keep if (numcls > `nc_low') & (numcls <= `nc_high')
					
					if (_N == 0) {
						di "$S_DATE $S_TIME: Skip loop iteration `year' `g', nc_low=`nc_low', nc_high=`nc_high'"
						continue
					}
					
					global has_`nc' = 1
					summ numcls
					scalar D=r(max)
					
					timer off 2
					
					disp "/*** SIMULATE INDICES UNDER PURPOSEFUL ASSIGNMENT ***/"
					
					timer on 5
					
					** sort students by ability within school, and assign ordered student number
					gsort gsch -zLMscore
					bysort gsch: gen studrnk=_n
					
					foreach vv in $SCORE_TYPES {
						** setting up component needed to calculate mean and std dev of both measures across permutations			
						gen double `vv'pfact1 = 0
					}
					save "$TEMPFILES\tracking_7_B.dta", replace

					** create campus x permutation dataset with randomly assigned class orderings
					quietly{
						keep if clstag == 1
						keep campus gcls clssz
						gen nsim = $NSIMPURP
						expand nsim

						bysort campus gcls: gen sim = _n
						sort campus sim gcls
						gen tmp1 = uniform()
						egen clsrnk = rank(tmp1), by(campus sim) unique

						sort campus sim clsrnk
						by campus sim: gen simcumclssz = sum(clssz)

						keep campus sim clsrnk simcumclssz
						reshape wide simcumclssz, i(campus sim) j(clsrnk)
					}
					save "$TEMPFILES\tracking_7_sim.dta", replace

					timer off 5

					** loop over permutations, augmenting the component sums each time

					use "$TEMPFILES\tracking_7_B.dta", clear
					
					forvalues n = 1/$NSIMPURP{
						timer on 6
						
						if (mod(`n', 50) == 1){
							di "$S_DATE $S_TIME: Purposeful assignment part #`nc' iteration #`n'"
						}
						
						quietly{
							timer on 61
							gen sim = `n'
							qui merge m:1 campus sim using "$TEMPFILES\tracking_7_sim.dta", keep(match)
							drop _merge
							timer off 61
							
							timer on 62
							gen tmpclsnum=1
							forvalues i=1(1) `=D-1'{
								qui replace tmpclsnum=`i'+1 if studrnk>simcumclssz`i'
							}
							timer off 62
							
							timer on 63
							egen tmpgcls=group(gsch tmpclsnum)
							drop sim tmpclsnum simcumclssz*
							egen tmpclssz = count(grade), by(tmpgcls)
							timer off 63
							
							timer on 64
							foreach vv in $SCORE_TYPES {
								egen tmp1 = sum(`vv'LMscore), by(tmpgcls)
								gen tmp2 = (tmp1 / tmpclssz) ^ 2
								egen kappa_sp = sum(tmp2), by(gsch)
								
								replace `vv'pfact1 = `vv'pfact1 + kappa_sp
								
								drop tmp1 tmp2 kappa_sp // ppfact*
							}
							timer off 64
							
							drop tmpclssz tmpgcls
						}
						timer off 6
					}

					** calculate mean and std deviation of both measures
					timer on 7
					
					foreach vv in $SCORE_TYPES {
						gen double `vv'trackRavg_max = ((`vv'pfact1 / $NSIMPURP) - `vv'lambda_s) / (`vv'iota_s - `vv'lambda_s)
						drop `vv'pfact*
					}
					timer off 7

					timer on 8
					save "$TEMPFILES\tracking_7_C_`nc'.dta", replace
					timer off 8
				}

				*** end of central loop *********************************************************************
				** combine datasets that were split into number-of-classes deciles
				timer on 9
				clear
				foreach nc in 1 2 3 4 5 6 7 8 9 10 {
					if ${has_`nc'} {
						append using "$TEMPFILES\tracking_7_C_`nc'.dta"
					}
				}
				timer off 9
				
				** calculate average class size
				egen avgclssz=mean(clssz), by(campus grade year)
				
				save "$TEMPFILES\tracking_7_D.dta", replace
				timer list
				
				** keep one observation per campus (x grade x year)
				timer on 10

				egen tmp1 = tag(campus grade year)
				keep if tmp1 == 1
				drop tmp1
				keep distnum0 campus grade year *track* schsz numcls hindex_predrop hindex_postdrop clssz* avgclssz
				drop clssz
				
				foreach aa in $SCORE_TYPES {
					la var `aa'trackR "actual R2 tracking measure (campus x grade x year)"
					la var `aa'trackRavg "analytical expected R2 across simulations, randsom assn"
					la var `aa'trackRsd "analytical std dev R2 across simulations, random assn"
					la var `aa'trackRavg_max "average R2 across simulations, full tracking"
				}
				
				la var schsz "number of students in tracking calculation"
				la var numcls "number of classes"
				la var clssz1 "size of 1st class"
				la var avgclssz "average class size (student-weighted)"

				disp "/*** SAVE DATASET ***/"
				
				if ("`trk_lev'" == "dis_sch") {
					keep distnum0 grade year *track* numcls
					
					foreach vv in R Ravg Rsd Ravg_max {
						rename ztrack`vv' dtrack`vv'
					}
				}

				save "$WORKING\tracking_7 results\tracking_7_`year'_`g'_`ss'_`trk_lev'", replace
				
				timer off 10
			}
			
			
			timer list
		}
	}
}

/************************************************
COMBINE SEPARATE YEAR-GRADE DATASETS AND ADD OTHER CAMPUS-GRADE-YEAR VARIABLES
************************************************/

capture log close
log using "$LOGFILES\tracking_7_post.log", replace


** loop over years and grades to combine datasets
local k = 0
foreach ss in math ela sci soc {
	foreach year in 2011 2012 2013 2014 2015 2016 2017 2018 2019 {
		forval g=4(1)8{
			display "`year'"
			di `g' + 100
			local k = `k' + 1
			if (`k' == 1) {
				use "$WORKING\tracking_7 results\tracking_7_`year'_`g'_`ss'_dis_sch", clear
				drop if year~=`year'
				gen subj = "`ss'"
			}
			else {
				append using "$WORKING\tracking_7 results\tracking_7_`year'_`g'_`ss'_dis_sch"
				replace subj = "`ss'" if subj == ""
			}
		}
	}
}
save "$WORKING\tracking_7_dis_sch", replace


** loop over years and grades to combine datasets
local k = 0
foreach ss in math ela sci soc {
	foreach year in 2011 2012 2013 2014 2015 2016 2017 2018 2019 {
		forval g=4(1)8{
			display "`year'"
			di `g' + 100
			local k = `k' + 1
			if (`k' == 1) {
				use "$WORKING\tracking_7 results\tracking_7_`year'_`g'_`ss'_sch_cls", clear
				drop if year~=`year'
				gen subj = "`ss'"
			}
			else {
				append using "$WORKING\tracking_7 results\tracking_7_`year'_`g'_`ss'_sch_cls"
				replace subj = "`ss'" if subj == ""
			}
		}
	}
}
save "$WORKING\temp\tracking_7_0", replace


local k = 0
foreach ss in math ela sci soc {
	local k = `k' + 1
	if (`k' == 1) {
		use "$WORKING\tracking_7 pieces\tracking_7_Z_`ss'", clear
		gen subj = "`ss'"
	}
	else {
		append using "$WORKING\tracking_7 pieces\tracking_7_Z_`ss'"
		replace subj = "`ss'" if subj == ""
	}
}

destring campus, replace
save "$WORKING\temp\tracking_7_1", replace


use "$WORKING\temp\tracking_7_1", clear

merge 1:1 subj campus grade year using "$WORKING\temp\tracking_7_0"
drop if _merge != 3
drop _merge

save "$WORKING\tracking_7_2", replace


** get full set of class sizes by cohort
use "$WORKING\tracking_7_2", clear
keep subj campus grade year clssz*
reshape long clssz, i(subj campus grade year) j(clsnum)

rename clssz oldclssz
keep if oldclssz != .
sort subj campus grade year oldclssz
by subj campus grade year: gen cumclssz = sum(oldclssz)
gen cumclssz_prev = cumclssz - oldclssz
gen clssz = oldclssz
gsort +subj +campus +grade +year -oldclssz -cumclssz
by subj campus grade year: replace clsnum = _n

keep subj campus grade year clsnum clssz
reshape wide clssz, i(subj campus grade year) j(clsnum)

save "$WORKING\temp\tracking_7_3.dta", replace

use "$WORKING\tracking_7_2", clear
drop clssz*
merge 1:1 subj campus grade year using "$WORKING\temp\tracking_7_3.dta"
assert _merge == 3
drop _merge

save "$WORKING\tracking_7", replace

log close
